Libraries and Data

library(pacman)
pacman::p_load(
  tidyquant,
  readr,
  ggplot2,
  dplyr,
  knitr,
  ggthemes
)

# data available at
# https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide
data <- read.csv("https://opendata.ecdc.europa.eu/covid19/casedistribution/csv", na.strings = "", fileEncoding = "UTF-8-BOM")

Covid Library

Documentation

Another useful way to get data and daily updates is with the covid library which can be installed as follows:

devtools::install_github("RamiKrispin/coronavirus")

We can then include it and get updates with

library(coronavirus)
update_dataset()
## No updates are available

To use the dataset we can do the following:

data("coronavirus")

And then take a look and the data we have available

glimpse(coronavirus)
## Rows: 135,020
## Columns: 7
## $ date     <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01-26,…
## $ province <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
## $ country  <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ lat      <dbl> 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, …
## $ long     <dbl> 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, …
## $ type     <chr> "confirmed", "confirmed", "confirmed", "confirmed", "confirm…
## $ cases    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

Now let’s try a simple plot of UK confirmed case count

uk <- coronavirus %>%
  filter(country == "United Kingdom") %>%
  filter(type == "confirmed" & cases > 0 & province == "") %>%
  select(-lat, -long, -country) %>%
  arrange(date)

uk
ggplot(uk, aes(date, cases)) +
  geom_line() +
  geom_point(alpha=0.2) +
  geom_smooth(se=FALSE) +
  labs(
    title = "Confirmed UK Coronavirus Cases",
    x = "Confirmed Cases",
    y = "Date"
  ) +
  theme_tufte()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Some simple examples

Top 20 countries

library(dplyr)

summary_df <- coronavirus %>% 
  filter(type == "confirmed") %>%
  group_by(country) %>%
  summarise(total_cases = sum(cases)) %>%
  arrange(-total_cases)
## `summarise()` ungrouping output (override with `.groups` argument)
summary_df

New cases in past 24 hours

library(tidyr)
coronavirus %>% 
  filter(date == max(date)) %>%
  select(country, type, cases) %>%
  group_by(country, type) %>%
  summarise(total_cases = sum(cases)) %>%
  pivot_wider(names_from = type,
              values_from = total_cases) %>%
  arrange(-confirmed)
## `summarise()` regrouping output by 'country' (override with `.groups` argument)

Plotting cases

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
coronavirus %>% 
  group_by(type, date) %>%
  summarise(total_cases = sum(cases)) %>%
  pivot_wider(names_from = type, values_from = total_cases) %>%
  arrange(date) %>%
  mutate(active = confirmed - death - recovered) %>%
  mutate(active_total = cumsum(active),
                recovered_total = cumsum(recovered),
                death_total = cumsum(death)) %>%
  plot_ly(x = ~ date,
                  y = ~ active_total,
                  name = 'Active', 
                  fillcolor = '#1f77b4',
                  type = 'scatter',
                  mode = 'none', 
                  stackgroup = 'one') %>%
  add_trace(y = ~ death_total, 
             name = "Death",
             fillcolor = '#E41317') %>%
  add_trace(y = ~recovered_total, 
            name = 'Recovered', 
            fillcolor = 'forestgreen') %>%
  layout(title = "Distribution of Covid19 Cases Worldwide",
         legend = list(x = 0.1, y = 0.9),
         yaxis = list(title = "Number of Cases"),
         xaxis = list(title = "Source: Johns Hopkins University Center for Systems Science and Engineering"))
## `summarise()` regrouping output by 'type' (override with `.groups` argument)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

Teemap plot

conf_df <- coronavirus %>% 
  filter(type == "confirmed") %>%
  group_by(country) %>%
  summarise(total_cases = sum(cases)) %>%
  arrange(-total_cases) %>%
  mutate(parents = "Confirmed") %>%
  ungroup() 
## `summarise()` ungrouping output (override with `.groups` argument)
  plot_ly(data = conf_df,
          type= "treemap",
          values = ~total_cases,
          labels= ~ country,
          parents=  ~parents,
          domain = list(column=0),
          name = "Confirmed",
          textinfo="label+value+percent parent")

Cumulative Global Data

The table above shows the cumulative confirmed cases of COVID-19 worldwide by date. Just reading numbers in a table makes it hard to get a sense of the scale and growth of the outbreak. Let’s draw a line plot to visualize the confirmed cases worldwide.

daily_global_counts

We do the cumulative counts

daily_global_counts$cum_cases = cumsum(daily_global_counts$total_cases)
daily_global_counts$cum_deaths = cumsum(daily_global_counts$total_deaths)

And plot

cum_cases_plot <- ggplot(daily_global_counts, aes(Date, cum_cases)) +
  geom_point() + 
  geom_line() +
  labs(xlab = "Date", ylab="Cumulative confirmed cases", title="Cumulative COVID-19 confirmed cases over time")

cum_cases_plot

And the log10 version on the y axis

cum_cases_plot + 
  scale_y_log10() +
  labs(
    title = "Cumulative cases (log10)",
    caption = "Source: "
  )

By country

Let’s now group by countries and look at individual national trends across the world

by_country <- data %>%
  group_by(countriesAndTerritories) %>%
  summarise(
    total_cases = sum(cases),
    total_deaths = sum(deaths)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
by_country

This is useful but it would also be useful to have some cumulative case and death counts for every country so that we can filter countries we wish to examine

top_cases_by_country <- by_country %>%
  filter(total_cases > 100000) %>%
  arrange(desc(total_cases))
top_cases_by_country

We can show this on a vertical barchart

library(RColorBrewer)
palette <- brewer.pal(5, "RdYlBu")[-(2:4)]

global_mean <- median(top_cases_by_country$total_cases)
x_start <- global_mean + 20000
y_start <- 5.5
x_end <- global_mean
y_end <- 7.5

ggplot(top_cases_by_country, aes(total_cases, countriesAndTerritories, color=total_cases)) +
  geom_point(size = 6) +
  geom_segment(aes(xend=100000, yend=countriesAndTerritories), size=1) +
  geom_text(aes(label = total_cases), color="white", size=1.5) +
  scale_x_continuous("", expand=c(0,0), limits=c(50000,2000000), position="top") +
  scale_color_gradientn(colors = palette) +
  labs(
    title = 'Top Countries by Confirmed Cases',
    captions = 'Source: opendata.org'
  ) +
  theme_minimal() +
  theme(
    axis.line.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title = element_blank(),
    axis.text = element_text(color = "black"),
    legend.position = "none"
  ) +
  geom_vline(xintercept = global_mean, color="grey40", linetype=3) +
    annotate(
    "text", 
    x = x_start, 
    y = y_start, 
    label="The\nglobal\naverage",
    vjust = 1, size = 3, color="grey40"
  ) +
  annotate(
    "curve",
    x = x_start, y=y_start,
    xend = x_end, yend = y_end,
    arrow = arrow(
      length = unit(0.2, "cm"),
      type = "closed"
    ),
    color="grey40"
  )
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_text).

4. Normal vs under-reporting of deaths

Let’s make a faceted view for each country of the cumulative cases Displaying multiple countries as a facet

by_country <- data %>%
  group_by(countriesAndTerritories, Date) %>%
  filter(countriesAndTerritories %in% c('China', 'Italy', 'Sweden', 'France', 'Iran', 'Brazil', 'India')) %>%
  summarise(
    total_cases = sum(cases),
    total_deaths = sum(deaths)
  ) %>%
  mutate(
    cum_cases = cumsum(total_cases),
    cum_deaths = cumsum(total_deaths)
  )
## `summarise()` regrouping output by 'countriesAndTerritories' (override with `.groups` argument)
by_country
ggplot(by_country) + 
  geom_line(aes(Date, total_cases), color='blue') +
  facet_wrap(~countriesAndTerritories, ncol=2, scale="free_y") +
  geom_col(aes(Date, total_deaths), color='red') +
  labs(
    title="Cases and Deaths for COVID-19",
    subtitle="Shown for select countries with stark\ndifference between reported case/death ratios",
    ylab = "Cases & Deaths"
  ) +
  theme_clean()

It is clear that in some countries there under-reporting of deaths. Let’s separate out some key suspects

under_reporting <- by_country %>%
  filter(countriesAndTerritories %in% c('China', 'Brazil', 'Iran'))

ggplot(under_reporting) + 
  geom_line(aes(Date, total_cases), color="steelblue") +
  facet_wrap(~countriesAndTerritories) +
  geom_col(aes(Date, total_deaths), color='red') +
  labs(
    title="Countries potentially underreporting case/death ratio"
  ) +
  theme_tq()

Compared to more typical countries

ord_reporting <- by_country %>%
  filter(countriesAndTerritories %in% c('France', 'Italy', 'United_Kingdom'))

ggplot(ord_reporting) + 
  geom_line(aes(Date, total_cases), color="steelblue") +
  facet_wrap(~countriesAndTerritories) +
  geom_col(aes(Date, total_deaths), color='red') +
  labs(
    title="Countries reporting typical case/death ratios"
  ) +
  theme_tq()

5. UK Cases Rate and Rolling Functions

Prepare the data as usual by getting

uk_data <- data %>%
  filter(countriesAndTerritories == 'United_Kingdom') %>%
  group_by(Date) %>%
  arrange(Date) %>%
  summarise(
    total_cases = sum(cases),
    total_deaths = sum(deaths)
  ) %>%
  mutate(
    cum_cases = cumsum(total_cases),
    cum_deaths = cumsum(total_deaths)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
  # summarize(
  #   cum_sizes  = cumsum(cases),
  #   cum_deaths = cumsum(deaths)
  # )
uk_data

Now let’s plot the total daily cases and the cumulative cases:

require(gridExtra)
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
plot1 <- ggplot(uk_data, aes(Date, total_cases)) +
  geom_line() +
  labs(
    title = 'UK Cases per day',
    subtitle = 'Jan, 2020 to June 2020',
    caption = 'Source: Open Data',
    ylab = 'Total cases'
  ) +
  ylim(0, 7500)
plot2 <- ggplot(uk_data, aes(Date, total_deaths)) +
  geom_line() +
  labs(
    title = 'UK Deaths per day',
    subtitle = 'Jan, 2020 to June 2020',
    caption = 'Source: Open Data',
    ylab = 'Total Deaths'
  ) +
  ylim(0, 7500)

grid.arrange(plot1, plot2, nrow=1)

library(tidyquant)
library(zoo)

uk_roll_mean <- uk_data %>%
  tq_mutate(
    select = total_cases,
    mutate_fun = rollapply,
    width = 30,
    align = 'right',
    FUN = mean,
    na.rm = TRUE,
    col_rename = "mean_30"
  ) %>%
 tq_mutate(
    select     = total_cases,
    mutate_fun = rollapply,
    width      = 90,
    align      = "right",
    FUN        = mean,
    na.rm      = TRUE,
    col_rename = "mean_90"
  )

uk_roll_mean %>%
  ggplot(aes(Date, total_cases)) +
    # geom_point(alpha=0.2) +
    geom_line(aes(Date, mean_30), color= palette_light()[[1]], size = 1, linetype = 1) +
    geom_line(aes(Date, mean_90), color= palette_light()[[2]], size = 1, linetype = 1) +
    labs(
      title = "UK Cases per day",
      subtitle = "Showing 30 and 90 days moving averages"
    ) +
    scale_color_tq() +
    theme(legend.position = "none")
## Warning: Removed 29 row(s) containing missing values (geom_path).
## Warning: Removed 89 row(s) containing missing values (geom_path).

Rolling custom functions

# Custom function to return mean, sd, 95% conf interval
custom_rolling_stats <- function(x, na.rm = TRUE) {
    # x     = numeric vector
    # na.rm = boolean, whether or not to remove NA's
    
    m  <- mean(x, na.rm = na.rm)
    s  <- sd(x, na.rm = na.rm)
    hi <- m + 2*s
    lo <- m - 2*s
    
    ret <- c(mean = m, stdev = s, hi.95 = hi, lo.95 = lo) 
    return(ret)
}

Now we can use this function on our UK cases data

uk_data_rollstats <- uk_data %>%
  tq_mutate(
    select = total_cases,
    mutate_fun = rollapply,
    width = 30,
    align = "right",
    by.column = FALSE,
    FUN = custom_rolling_stats,
    na.rm = TRUE
  )

uk_data_rollstats

We now have the data to view

  1. Rolling average (trend)
  2. 95% confidence interval (volatility)

Let’s plot this in a Bollinger Bands style

uk_data_rollstats %>%
  ggplot(aes(Date)) +
  geom_point(aes(y=total_cases), color="grey40", alpha=0.5) +
  geom_ribbon(aes(ymin = lo.95, ymax = hi.95), alpha=0.4) +
  geom_line(aes(y = mean), linetype = 2, size = 1, alpha = 0.7, color="red") +
  labs(
    title = "UK Cases by day", x = "",
    subtitle = "30-Day Moving Average with\n 95% Confidence Interval Bands (+/-2 Standard Deviations)"
  ) + 
  scale_color_tq() +
  theme_tq() +
  theme(
    legend.position = "None"
  )
## Warning: Removed 29 row(s) containing missing values (geom_path).

Now let’s try this and play with a variable for the width of the rolling average called roll_width() and also perform the analysis on death data

rollwidth <- 7

# cases plot
plot_1 <- uk_data %>%
  tq_mutate(
    select = total_cases,
    mutate_fun = rollapply,
    width = rollwidth,
    align = "left",
    by.column = FALSE,
    FUN = custom_rolling_stats,
    na.rm = TRUE
  ) %>%
  ggplot(aes(x = Date)) +
  geom_point(aes(y=total_cases), color="grey40", alpha=0.5, position = 'stack') +
  geom_ribbon(aes(ymin = lo.95, ymax = hi.95), alpha=0.4) +
  geom_line(aes(y = mean), linetype = 1, size = 1, alpha = 0.8, color="steelblue") +
  labs(
    title = "UK Cases by day", x = "",
    subtitle = "Dynamic Moving Average with\n 95% Confidence Interval Bands (+/-2 Standard Deviations)",
    caption = "Open Data",
    ylab = "Total Cases"
  ) + 
  scale_color_tq() +
  theme_tq() +
  theme(
    legend.position = "None"
  )

# deaths plot
plot_2 <- uk_data %>%
  tq_mutate(
    select = total_deaths,
    mutate_fun = rollapply,
    width = rollwidth,
    align = "left",
    by.column = FALSE,
    FUN = custom_rolling_stats,
    na.rm = TRUE
  ) %>%
  ggplot(aes(x = Date)) +
  geom_point(aes(y=total_deaths), color="grey40", alpha=0.5, position = 'stack') +
  geom_ribbon(aes(ymin = lo.95, ymax = hi.95), alpha=0.4) +
  geom_line(aes(y = mean), linetype = 1, size = 1, alpha = 0.8, color="steelblue") +
  labs(
    title = "UK Deaths by day", x = "",
    subtitle = "Dynamic Moving Average with\n 95% Confidence Interval Bands (+/-2 Standard Deviations)",
    caption = "Open Data",
    ylab = "Total Cases"
  ) + 
  scale_color_tq() +
  theme_tq() +
  theme(
    legend.position = "None"
  )

grid.arrange(plot_1, plot_2, nrow=1)
## Warning: Removed 6 row(s) containing missing values (geom_path).

## Warning: Removed 6 row(s) containing missing values (geom_path).

Now let’s try to normalise the x-axis and plot both on the same axis

cases_roll_data <- uk_data %>%
  tq_mutate(
    select = total_cases,
    mutate_fun = rollapply,
    width = 7,
    align = "right",
    by.column = FALSE,
    FUN = custom_rolling_stats,
    na.rm = TRUE
  )

deaths_roll_data <- uk_data %>%
  tq_mutate(
    select = total_deaths,
    mutate_fun = rollapply,
    width = 7,
    align = "right",
    by.column = FALSE,
    FUN = custom_rolling_stats,
    na.rm = TRUE
  )

    
  ggplot(NULL, aes(x = Date)) +
  geom_point(data=cases_roll_data, aes(y=total_cases), color="grey40", alpha=0.5, position = 'stack') +
  geom_ribbon(data=cases_roll_data, aes(ymin = lo.95, ymax = hi.95), alpha=0.4) +
  geom_line(data=cases_roll_data, aes(y = mean), linetype = 1, size = 1, alpha = 0.8, color="steelblue") +
  geom_line(data=deaths_roll_data, aes(y = mean), linetype = 1, size = 0.5, alpha = 0.5, color="red") +
  geom_ribbon(data=deaths_roll_data, aes(ymin = lo.95, ymax = hi.95), alpha=0.4) +
  labs(
    title = "UK Cases & Deaths per day", x = "Date", y = "Cases & Deaths",
    subtitle = "Dynamic Moving Average with\n 95% Confidence Interval Bands (+/-2 Standard Deviations)",
    caption = "Open Data",
    ylab = "Total Cases"
  ) + 
  scale_color_tq() +
  theme_tq() +
  theme(
    legend.position = "None"
  )
## Warning: Removed 6 row(s) containing missing values (geom_path).

## Warning: Removed 6 row(s) containing missing values (geom_path).